library(Seurat)
library(SnapATAC)
## Loading required package: Matrix
## Loading required package: rhdf5
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ tidyr::pack()   masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(DescTools)  # 4 AUC function
## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata
library(glue)
## 
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
## 
##     collapse
library(ggalluvial)  # 4 river plot
library(ggpubr)
## Loading required package: magrittr
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
source("~/multiOmic_benchmark/utils.R")
source("~/multiOmic_benchmark/KNN_agreement.R")


gg_color_hue <- function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

## Make output directory
outdir <- "~/multiOmic_benchmark/report/output/20191113_labelTransferEDA_F74_v2_bgmat_reference/"
ifelse(!dir.exists(outdir), dir.create(outdir), FALSE)
## [1] FALSE
seu.cca <- readRDS("~/models2/labelTransferCCA_F74_bgmat_referenceHVGFeatures.RDS")
seu.liger <- readRDS("~/models2/labelTransferLiger_F74_bgmat_referenceHVGFeatures.RDS")
seu.conos <- readRDS("~/models2/labelTransferConos_F74_bgmat_referenceHVGFeatures.RDS")

integrate_features <- scan("~/my_data/intFeatures_referenceHVG_F74_bgmat.txt", what='')

int.list <- list(CCA=seu.cca, Liger=seu.liger, Conos=seu.conos)

# ## Make method color palette
# method.palette <- brewer_palette_4_values(names(int.list), "Set1")

Embeddings

Visualize label transfer on original ATAC data (embedded SnapATAC bins)

## Load original data
orig.ATAC <- readRDS("~/my_data/F74_ATAC_snapAtac_processed_bgmat.RDS")
orig.RNA <- readRDS("~/my_data/F74_RNA_seurat_processed.RDS")

## Make SeuratObjects
# atac.seu <- snapToSeurat(
#     obj=orig.ATAC, 
#     eigs.dims=1:16, 
#     norm=TRUE,
#     scale=TRUE
#     )
# atac.seu <- RenameCells(atac.seu, new.names = orig.ATAC@metaData$barcode)
atac.seu <- as.Seurat(orig.ATAC, counts = "counts", data="logcounts")
## Warning: All keys should be one or more alphanumeric characters followed by
## an underscore '_', setting key to LSI_
## Warning: All keys should be one or more alphanumeric characters followed by
## an underscore '_', setting key to UMAP_
## Add cell type predictions
pred.cca <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max")
pred.liger <- getPredictedLabels(seu.liger, "Liger")
pred.conos <- getPredictedLabels(seu.conos, "Conos")

if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
  atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca, pred.liger, pred.conos))
} else {
  stop("Non corresponding cell names")
}
## make cell type palette
cell.types <- levels(seu.cca$RNA$annotation)
cell.type.pal <- brewer_palette_4_values(cell.types, palette = "Set1") %>% setNames(cell.types)
# atac.seu <- RunUMAP(atac.seu, reduction = "SnapATAC", reduction.name = "umap.snap", dims=1:16)

## Embedding RNA
orig.RNA.seu <- as.Seurat(orig.RNA)
orig.RNA.seu <- FindVariableFeatures(orig.RNA.seu)
orig.RNA.seu <- ScaleData(orig.RNA.seu)
## Centering and scaling data matrix
orig.RNA.seu <- RunPCA(orig.RNA.seu)
## PC_ 1 
## Positive:  TRBC2, TRBC1, HMGA1, HIST1H1C, HIST1H3H, ITM2A, HIST1H2BJ, SMIM24, TRAV13-2, FXYD2 
##     TRBV7-2, PCGF5, HIST1H2BH, IL32, HIST1H2BN, CHAC1, RASD1, TRBV27, TRAV13-1, TRAV8-2 
##     TRAV8-4, PTPN6, SELL, HIST1H2BG, TAGAP, TRDC, TRAV38-2DV8, TRAV29DV5, TRBV9, TRAV41 
## Negative:  CALD1, COL5A2, COL6A1, COL6A2, SPARC, THY1, DCN, COL3A1, NFIB, SPARCL1 
##     TSHZ2, CPE, PLAC9, NID1, FKBP10, PTN, FLRT2, MAP1B, EFEMP2, BGN 
##     CXCL12, RBP1, LAMB1, AHNAK, COL1A1, COL5A1, FSTL1, LUM, LAMA4, MDK 
## PC_ 2 
## Positive:  SFRP1, NTRK2, PLAT, ISLR, NRK, SCARA5, ASPN, OSR1, OLFML3, MXRA8 
##     CAPN6, PTPRD, PLP1, TMEFF2, CREB3L1, DKK3, CERCAM, MMP2, EBF2, SMOC2 
##     CDO1, COL12A1, PDGFRA, LRRC17, THBS2, HTRA3, SFRP2, ANGPTL1, MAB21L1, MXRA5 
## Negative:  MKI67, CDK1, NUSAP1, TOP2A, CCNA2, RRM2, UBE2C, BIRC5, KIFC1, TYMS 
##     UBE2T, AURKB, CENPF, CENPM, CDCA8, TACC3, NCAPG, TPX2, ASF1B, CDKN3 
##     GTSE1, CDCA3, HJURP, SPC25, MAD2L1, CDC20, PLK1, DLGAP5, NUF2, KIF22 
## PC_ 3 
## Positive:  CCNA2, NUF2, GTSE1, CDCA8, UBE2T, AURKB, PBK, CDK1, NCAPG, NDC80 
##     KIFC1, CDCA3, HJURP, MAD2L1, SPC25, PLK1, KIF15, DEPDC1B, BIRC5, CDCA5 
##     CKS1B, RRM2, DLGAP5, HMMR, CENPA, KIF22, KIF20A, CDCA2, CENPF, KIF2C 
## Negative:  HLA-DRB5, HLA-DRA, TYROBP, HLA-DRB1, HLA-DPA1, HLA-DPB1, HLA-DQA1, C1QC, C1QB, RNASE1 
##     HLA-DQB1, A2M, C1QA, CSF1R, STAB1, HCK, HLA-DMB, FOLR2, SAMHD1, LYZ 
##     MS4A7, SPI1, CD36, MS4A6A, CYBB, TMEM176B, CD74, IGSF6, MPEG1, MS4A4A 
## PC_ 4 
## Positive:  GYPA, GYPB, NFE2, GYPE, ANK1, SLC4A1, RHAG, AHSP, DMTN, KLF1 
##     GATA1, TMOD1, TMEM56, HBZ, HBG1, GMPR, C17orf99, SMIM5, HBQ1, TSPO2 
##     ALAS2, PHOSPHO1, CR1L, TRIM58, HBM, EPB42, RHD, RHCE, SPTA1, SMIM1 
## Negative:  TMSB10, TRBC2, CCL21, IL32, CXCL13, TRBC1, APLNR, CCL19, MIF, HMGB1 
##     COX4I2, COL4A1, COL15A1, MADCAM1, FDCSP, CCL17, NTS, TNC, CDH5, FABP4 
##     CAV1, COL4A2, CRIP2, RGS5, KLRB1, MYLK, IFITM1, IL33, PAPLN, HPN 
## PC_ 5 
## Positive:  APLNR, CDH5, CAV1, COL15A1, CRIP2, COL4A1, CCL21, KDR, CLDN5, MADCAM1 
##     FABP4, IL33, COL4A2, CXCL13, TM4SF1, PODXL, CCL19, COX4I2, ESAM, BCAM 
##     NTS, PAPLN, SPNS2, MYLK, C8orf4, ADGRF5, TM4SF18, TGM2, RP11-536O18.1, CXorf36 
## Negative:  CSF1R, MS4A4A, CYBB, PLD4, MS4A6A, CD163, HCK, IGSF6, FOLR2, ADAP2 
##     CD86, MS4A7, MARCH1, MRC1, F13A1, MPEG1, CD14, SPI1, HLA-DMB, GPR34 
##     CD33, TGFBI, CLEC7A, TIMD4, CEBPA, SIGLEC1, CSF2RA, SLC15A3, LY86, AGR2
orig.RNA.seu <- RunUMAP(orig.RNA.seu, dims=1:40)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 17:30:04 UMAP embedding parameters a = 0.9922 b = 1.112
## 17:30:04 Read 8321 rows and found 40 numeric columns
## 17:30:04 Using Annoy for neighbor search, n_neighbors = 30
## 17:30:04 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 17:30:06 Writing NN index file to temp file /tmp/RtmpIfrtJS/file241756ada007
## 17:30:06 Searching Annoy index using 1 thread, search_k = 3000
## 17:30:09 Annoy recall = 100%
## 17:30:10 Commencing smooth kNN distance calibration using 1 thread
## 17:30:11 Initializing from normalized Laplacian + noise
## 17:30:12 Commencing optimization for 500 epochs, with 367644 positive edges
## 17:30:34 Optimization finished
ggpubr::ggarrange(
  plotlist = list(
    DimPlot(orig.RNA.seu, reduction = "umap", group.by = "annotation", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("RNA"),
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_CCA"  , cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("CCA"),
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_Liger", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Liger"),
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_Conos", cols=cell.type.pal, label=TRUE, repel=TRUE) + ggtitle("Conos")
  ),
  common.legend = TRUE, ncol=4, nrow=1
) +
  ggsave(paste0(outdir, "umap_labels.png"), width=17, height = 6)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.

Filter low confidence calls

pred.cca.filtered <- getPredictedLabels(seu.cca, "CCA", score.col = "prediction.score.max", filter_score = 0.5)
pred.liger.filtered <- getPredictedLabels(seu.liger, "Liger", filter_score = 0.5)
pred.conos.filtered <- getPredictedLabels(seu.conos, "Conos", filter_score = 0.5)

if (all(rownames(pred.conos) == rownames(pred.cca)) & all(rownames(pred.conos) == rownames(pred.liger))) {
  atac.seu <- AddMetaData(atac.seu, metadata = cbind(pred.cca.filtered, pred.liger.filtered, pred.conos.filtered))
} else {
  stop("Non corresponding cell names")
}

ggpubr::ggarrange(
  plotlist = list(
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_CCA", cols=cell.type.pal) + 
      scale_color_manual(values = cell.type.pal, na.value="grey80") +
      ggtitle("CCA"),
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_Liger", cols=cell.type.pal) + 
      scale_color_manual(values = cell.type.pal, na.value="grey80") + ggtitle("Liger"),
    DimPlot(atac.seu, reduction = "UMAP", group.by = "predicted.id_Conos", cols=cell.type.pal) + 
      scale_color_manual(values = cell.type.pal, na.value="grey80") + ggtitle("Conos"),
    DimPlot(orig.RNA.seu, reduction = "umap", group.by = "annotation", cols=cell.type.pal) + ggtitle("RNA") +
      scale_color_manual(values = cell.type.pal, na.value="grey80")
  ),
  common.legend = TRUE, ncol=4, nrow=1
) +
  ggsave(paste0(outdir, "umap_labels_filtered.png"), width=16, height = 6)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

Prediction score

Quantifies the uncertainty of the prediction. Calculated differently for every method, but used to define which cells are “unassigned”.

orig.composition <- orig.RNA$annotation
orig.frac <- table(orig.composition)/length(orig.composition)

orig.frac.df <- data.frame(orig.frac) %>%
  dplyr::rename(predicted.id=orig.composition, frac.label=Freq) %>%
  mutate(method="original.RNA")

score_cols <- str_subset(colnames(atac.seu@meta.data), 'score_')
label_cols <- str_subset(colnames(atac.seu@meta.data), 'predicted.id_')

pred.labels.df <- imap(list(CCA=pred.cca, Liger=pred.liger, Conos=pred.conos), ~ 
      rownames_to_column(.x, "cell") %>%
      rename_all(funs(str_remove(., str_c("_",.y)))) %>%
      mutate(method=.y)
    ) %>%
  purrr::reduce(bind_rows) %>%
  mutate(score=ifelse(is.na(score), 0, score))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
predict_score_hist <- 
  pred.labels.df %>%
  ggplot(aes(score, fill=method)) +
  geom_histogram(position="identity", alpha=0.8, bins=40) +
  facet_grid(method ~., scales = "free_y") +
  scale_fill_brewer(palette="Set1") +
  xlab("Label prediction score") +
  theme_bw(base_size = 16) +
  theme(legend.position = "top")

cutoffs <- seq(0,1,0.05)
predict_score_cumedist <-
  pred.labels.df %>%
  group_by(method) %>%
  mutate(bins=cut(score, breaks = cutoffs)) %>%
  mutate(score=as.numeric(str_remove_all(as.character(bins), ".+,|]"))) %>%
  ggplot(aes(score, color=method)) +
  stat_ecdf(size=0.8, alpha=0.7) +
  scale_color_brewer(palette = "Set1") +
  ylab("Fraction of unassigned cells") +
  xlab("Prediction score cutoff") +
  theme_bw(base_size = 16) +
  xlim(0,1) +
  coord_fixed() +
  guides(color="none") 

ggpubr::ggarrange(predict_score_hist, predict_score_cumedist, common.legend = TRUE, widths = c(0.8, 1.2),
          labels=c("A", "B")) +
  ggsave(paste0(outdir, "prediction_score_distribution.png"), height = 6, width = 10)
## Warning: Removed 66 rows containing non-finite values (stat_ecdf).

ggpubr::ggarrange(
  plotlist = list(
    FeaturePlot(atac.seu, reduction = "UMAP", feature = "score_CCA"  , coord.fixed = TRUE) + scale_color_viridis_c() + ggtitle("CCA"),
    FeaturePlot(atac.seu, reduction = "UMAP", feature = "score_Liger", coord.fixed = TRUE) + scale_color_viridis_c() + ggtitle("Liger"),
    FeaturePlot(atac.seu, reduction = "UMAP", feature = "score_Conos", coord.fixed = TRUE) + scale_color_viridis_c() + ggtitle("Conos")
  ),
  common.legend = TRUE, ncol=3, nrow=1
) +
  ggsave(paste0(outdir, "prediction_score_umaps.png"), height = 7, width=14)
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.
## Scale for 'colour' is already present. Adding another scale for
## 'colour', which will replace the existing scale.

pred.labels.df %>%
  group_by(method, predicted.id) %>%
  mutate(median.score=median(score), size=n()) %>%
  ungroup() %>%
  group_by(method) %>%
  mutate(rank = dense_rank(median.score)) %>%
  ungroup() %>%
  # ggplot(aes(size, median.score)) + geom_point(aes(color=method)) +
  ggplot(aes(as.factor(rank), score, fill=predicted.id)) +
  # geom_violin() +
  geom_boxplot(varwidth = F) +
  # ggbeeswarm::geom_quasirandom(alpha=0.3) +
  # geom_point(aes(y=median.score)) +
  # stat_ecdf() +
  facet_wrap(method~., ncol=3, scales="free_x") +
  scale_fill_manual(values=cell.type.pal)

Cell type composition

Compare cell type fractions (w uncertainty)

orig.rank.df <- orig.frac.df %>% 
  mutate(orig.rank=dense_rank(frac.label)) %>%
  select(orig.rank, predicted.id) %>%
  distinct() %>%
  arrange(orig.rank) %>%
  column_to_rownames("predicted.id") 

pred.labels.df %>%
  group_by(method) %>%
  drop_na() %>%
  mutate(tot.cells=n()) %>%
  ungroup() %>%
  group_by(method, predicted.id) %>%
  summarise(tot.label = n(), tot.cells = max(tot.cells), mean.score=mean(score)) %>%
  mutate(frac.label=tot.label/tot.cells) %>%
  bind_rows(orig.frac.df) %>%
  mutate(orig.rank = orig.rank.df[predicted.id,]) %>%
  mutate(predicted.id=factor(predicted.id, levels=rownames(orig.rank.df)))%>%
  # select(method, predicted.id, frac.label) %>%
  # distinct() %>%
  ggplot(aes(predicted.id, frac.label, fill=mean.score, color=mean.score)) +
  geom_point(size=2) +
  geom_col(width=0.05) +
  coord_flip() +
  # geom_line(aes(group=method)) +
  facet_wrap(method~., nrow=1, ncol=4, scales="free_x") +
  scale_color_viridis_c() +
  scale_fill_viridis_c() +
  ylab("Fraction of cells") +
  theme_bw(base_size = 16) +
  ggsave(paste0(outdir, "cell_type_composition_bars.png"), width = 15, height = 5)
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

Agreement with unsupervised clustering of ATAC data

Calculate which fractions of NNs in bin based graph of ATAC cells have the same annotation

k = 30
atac.seu <- FindNeighbors(atac.seu, assay = "ATAC", reduction = "LSI", dims = 1:16, k.param = k)
## Computing nearest neighbor graph
## Computing SNN
atac.nn.list <- getNNlist(atac.seu)

knn.score.CCA <- test.knn(atac.nn.list, setNames(pred.cca.filtered$predicted.id_CCA, rownames(pred.cca.filtered)))
## Warning in ks.test(x = true, y = null, alternative = "less"): p-value will
## be approximate in the presence of ties
knn.score.conos <- test.knn(atac.nn.list, setNames(pred.conos.filtered$predicted.id_Conos, rownames(pred.conos.filtered)))
## Warning in ks.test(x = true, y = null, alternative = "less"): p-value will
## be approximate in the presence of ties
knn.score.liger <- test.knn(atac.nn.list, setNames(pred.liger.filtered$predicted.id_Liger, rownames(pred.liger.filtered)))
## Warning in ks.test(x = true, y = null, alternative = "less"): p-value will
## be approximate in the presence of ties
knn_score_df <-
  list(CCA=knn.score.CCA, conos=knn.score.conos, liger=knn.score.liger) %>%
  imap( ~ data.frame(KNN_score = .x$KNN_score, D=.x$D, p.val=.x$p.val, method=.y)) %>%
  # imap( ~ data.frame(KNN_score = .x$KNN_score, cell= names(.x$KNN_score), D=.x$D, p.val=.x$p.val, method=.y)) %>%
  purrr::reduce(bind_rows) %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score)) %>%
  mutate(data="true")
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
knn_score_null_df <-
  list(CCA=knn.score.CCA, conos=knn.score.conos, liger=knn.score.liger) %>%
  imap( ~ data.frame(KNN_score = .x$null, D=.x$D, p.val=.x$p.val, method=.y)) %>%
  # imap( ~ data.frame(KNN_score = .x$KNN_score, cell= names(.x$KNN_score), D=.x$D, p.val=.x$p.val, method=.y)) %>%
  purrr::reduce(bind_rows) %>%
  dplyr::mutate(KNN_score=ifelse(is.na(KNN_score), 0, KNN_score)) %>%
  mutate(data="null")
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
bind_rows(knn_score_df, knn_score_null_df) %>%
  ggplot(aes(KNN_score, color=method)) +
  stat_ecdf( aes(alpha=data), size=1) +
  # stat_ecdf(data=. %>% filter(data=="true"), size=1) +
  facet_grid(method~.) +
  scale_alpha_discrete( range=c(0.5,1), name="") +
  scale_color_brewer(palette = "Set1") +
  geom_text(data= . %>% distinct(method, D, p.val), 
            x=1, y=0.05, hjust=1,
            aes(label=glue("KNN score = {round(D, 3)}, p.value: {p.val}"), y=c(0.90, 0.95, 1))) +
  theme_bw(base_size = 16) +
  ylab("ECDF") + xlab("Fraction of KNNs with shared label") +
  ggsave(paste(outdir,"KNN_score_ecdf_unionHVG.png"), height = 6, width=7)
## Warning: Using alpha for a discrete variable is not advised.

Accessibility of markers

Taking markers from Fig. S2 of JP’s manuscript

thymus.markers <- c("PTPRC", "CD3G", "TYROBP","CD19","HOXA9",'FXYD2',"SH3TC1","CCR9","CD8A", "CD8B","PDCD1", "CRTAM","CD40LG","CCR6","FOXP3","SOX13","ZNF683","KLRD1","TNFSF11","VPREB1","MS4A1", "CLEC9A", "CLEC10A", "LAMP3", "IL3RA", "FCGR3B", "C2","TPSB2",
                    'ITGA2B',"GYPA", "CDH5", "RGS5","CDH1", "PDGFRA","CRABP1")
# pbmc.markers <- c("CD79A", "MS4A1", "CD8A", "CD8B", "LYZ")
# thymus.markers <- list(Fb=c("PDGFRA", "COLEC11", "FBN1", "PI16"),
#                        VSMC=c("PDGFRB", 'ACTA2', "RGS5"),
#                        Endo=c("PECAM1", "CDH5","LYVE1"),
#                        TEC = c("EPCAM", "FOXN1", "CCL25", "CCL19")
#                        )
thymus.markers.df <- imap(thymus.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
  purrr::reduce(bind_rows)

marker.access.df <- atac.seu@assays$RNA@data[intersect(thymus.markers, rownames(atac.seu@assays$RNA)),] %>%
  as.matrix() %>%
  reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
  full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
  # full_join(thymus.markers.df) %>%
  pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
  dplyr::mutate(method=str_remove(method,".+_")) %>%
  filter(method %in% c("CCA", "Liger", "Conos")) 

ordered_cell_types <- c("DN", "DP (Q)", "DP (P)", "SP (1)", "NK", "ILC3", "DC", "Mac", "Ery", "Fib")

markers_pl <- 
  marker.access.df %>%
  mutate(predicted.id = case_when(str_detect(predicted.id, "CD8") ~ "CD8+T",
                                  # str_detect(predicted.id, "CD4") ~ "CD4+T",
                                  TRUE ~ predicted.id
                                  )
         ) %>%
  mutate(predicted.id=factor(predicted.id, levels = ordered_cell_types)) %>%
  group_by(method, predicted.id, gene) %>%
  dplyr::mutate(frac.cells=sum(log.counts > 0)/n()) %>%
  # filter(method=="CCA") %>%
  ungroup() %>%
  ggplot( aes( gene, predicted.id)) +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  facet_grid(method~., space="free", scales="free_x") +
  scale_color_gradient(high="darkblue", low="white") +
  # scale_color_viridis_c() +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.x = element_text(angle=45)) 

markers_pl 

ggsave(paste0(outdir, "Thymus_markers_accessibility.png"), height = 16, width = 12)

Reproducing Fig.2H on T-cell development

t.cell.markers <- list(known.markers = c("CD34", "IGLL1", "TRGC2", "TRDC", "PTCRA", "TRBC2", "TRAC", "CD4", "CD8A", "CD8B"),
                       chemokine.receptors = c("CCR9", "CCR7"),
                       tcr.activation = c("CD5", "CD27"),
                       proliferation=c("PCNA", "CDK1", "MKI67"),
                       cyclin.D = c("CCND2", "CCND3"),
                       recombination=c("RAG1", "RAG2"),
                       apoptosis=c("HRK","BMF", "TP53INP1"),
                       stage.markers = c("ST18", "HIVEP3", "RGPD3", "SMPD3", "AQP3", "RORC", "SATB1", "TOX2")
                       ) 
t.cell.markers.df <- imap(t.cell.markers, ~ data.frame(gene=.x, cell.type.class=.y)) %>%
  purrr::reduce(bind_rows)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
ordered.tcells <- c("DN", "DP (P)", "DP (Q)","SP (1)")

tcells.markers.df <- 
  atac.seu@assays$RNA@data[intersect(thymus.markers, rownames(atac.seu@assays$RNA)),] %>%
  as.matrix() %>%
  reshape2::melt(varnames=c("gene", "cell"), value.name="log.counts") %>%
  full_join(rownames_to_column(atac.seu@meta.data[, label_cols], "cell")) %>%
  pivot_longer(cols=label_cols, names_to = "method", values_to = "predicted.id") %>%
  dplyr::mutate(method=str_remove(method,".+_")) %>%
  filter(method %in% c("CCA", "Liger", "Conos")) %>%
  mutate(predicted.id=ifelse(str_detect(predicted.id, "CD8+"), "CD8+T", predicted.id)) %>%
  mutate(predicted.id=ifelse(str_detect(predicted.id, "CD4+"), "CD4+T", predicted.id)) %>%
  filter(predicted.id %in% ordered.tcells) %>%
  group_by(method, predicted.id, gene) %>%
  dplyr::mutate(frac.cells=sum(log.counts > 0)/n(), mean.acc=mean(log.counts)) %>%
  ungroup() 
## Joining, by = "cell"
## Warning: Column `cell` joining factor and character vector, coercing into
## character vector
tcells.markers.df %>%
  full_join(t.cell.markers.df) %>%
  # filter(method=="CCA") %>%
  mutate(predicted.id=factor(predicted.id, levels=ordered.tcells)) %>%
  ggplot(aes( predicted.id, gene)) +
  facet_grid(cell.type.class~method, scales = "free_y", space="free") +
  geom_point(aes(size=frac.cells, color=frac.cells)) +
  # scale_color_gradient(high="darkblue", low="white") +
  scale_color_viridis_c() +
  # scale_color_gradient2(midpoint = 0.5) +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5),
        strip.text.y = element_text(angle=0)) 
## Joining, by = "gene"
## Warning: Column `gene` joining factor and character vector, coercing into
## character vector
## Warning: Removed 29 rows containing missing values (geom_point).

ggsave(paste0(outdir, "tcell_markers.png"), height = 14, width = 14)
## Warning: Removed 29 rows containing missing values (geom_point).

Thoughts

  • Conos scores a lot of cells with high confidence, but fails to assign cells to difficult clusters
  • CCA resembles the composition of the RNA data better, but curious that the other methods identify way more